import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics
# ---- Load data and basic preprocessing ----
df1 = pd.read_csv('data.csv')

# In a plain script a bare `df1.head()` expression is evaluated and
# discarded; print the preview and the per-column null counts explicitly.
print(df1.head())
print(df1.isnull().sum())

# Split dataset into features and target variable.
# 'artist' and 'song_title' are excluded: every value is unique, so a
# decision tree cannot derive a useful split from them.
feature_cols = ['acousticness','danceability','duration_ms','energy','instrumentalness','key','liveness','loudness','mode','speechiness','tempo','time_signature','valence']
X = df1[feature_cols]  # feature matrix
y = df1.target         # target column
import matplotlib.pyplot as plt

# ---- Exploratory plots ----
# Box plot of every column to eyeball value ranges and outliers.
df1.plot.box(figsize=(12, 8))
plt.xticks(
    list(range(1, len(df1.columns) + 1)),
    df1.columns,
    rotation='vertical')
plt.show()  # without show() the figure never renders outside a notebook

# Pairwise correlations, printed explicitly (a bare `df1.corr()` result is
# discarded in a script). numeric_only=True skips the string columns
# (artist, song_title), which would raise a TypeError on pandas >= 2.
print(df1.corr(numeric_only=True))
# ---- Train/test split, model fit, evaluation ----
# 70% training and 30% test; fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

from sklearn import tree
import matplotlib.pyplot as plt

# Create and train the Decision Tree classifier.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset.
y_pred = clf.predict(X_test)

# Plot the decision tree. The figure size must be configured BEFORE the
# tree is drawn: plt.rcParams only affects figures created afterwards, so
# setting it after plot_tree() (as the original did) never resized this one.
plt.rcParams["figure.figsize"] = (500, 100)
tree.plot_tree(clf, filled=True)
plt.savefig('tree.pdf')  # save before show(); show() clears the figure
plt.show()

# metrics is already imported at the top of the file.
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from pprint import pprint

scalar = StandardScaler()
# min_samples_split=100 is chosen based on the data size: a smaller
# threshold lets nodes keep splitting and produces an overly dense tree.
c = DecisionTreeClassifier(min_samples_split=100)

X = df1[feature_cols]
y = df1.target

# ---- Tune max_depth via 7-fold cross-validation ----
# NOTE: the loop bodies below were flattened to column 0 in the original
# file (a SyntaxError); the intended indentation is restored here.
depth = []     # (max_depth, mean CV accuracy)
d_scores = []  # (max_depth, per-fold score array)
for i in range(3, 30):
    c = DecisionTreeClassifier(max_depth=i)
    # Scale then fit inside one pipeline so the scaler is fitted only on
    # each training fold (avoids leakage into the validation fold).
    pipeline = Pipeline([('transformer', scalar), ('estimator', c)])
    scores = cross_val_score(pipeline, X=X, y=y, cv=7, n_jobs=4)
    depth.append((i, scores.mean()))
    d_scores.append((i, scores))

pprint(depth)
for ea in d_scores:
    print("height = ", ea[0], " -> \n", ea[1], '\n')
# ---- Choose k for k-fold CV at the selected depth (height 5) ----
# The classifier and pipeline do not depend on the loop variable, so they
# are built once outside the loop (the original rebuilt them every pass,
# and its loop body had lost its indentation — restored here).
c = DecisionTreeClassifier(max_depth=5)
pipeline = Pipeline([('transformer', scalar), ('estimator', c)])

k_fold_summary = []  # (k, mean CV accuracy)
for i in range(3, 15):
    scores = cross_val_score(pipeline, X=X, y=y, cv=i, n_jobs=4)
    k_fold_summary.append((i, scores.mean()))

pprint(k_fold_summary)
# k = 4 yields accuracy very close to k = 7; given the larger computation
# required by k = 7, k = 4 is preferred.